rm(list=ls())
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ade4)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(readxl)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

PRELIMINARY SETTINGS

Set working directory

setwd(dir="/srv/mfs/hausserlab/anissa.el/ImmuneStates/wagner_analysis")
source("./scripts/R/Wagner_Keren_functions.r")
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:plotly':
## 
##     groups
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
MacroMicroData <- readRDS(file="./outputs/MacroMicroData.rds")
SitesBBKeren <- readRDS(file = "../data_BB/archetypesSitesKeren.rds")

First, let’s have a look at the cell types to see if there are some constraints that building blocks can explain :

pairs(MacroMicroData$MacroTumorsW, pch = 19)

LINEAR REGRESSION

VARIABLE TO EXPLAIN : cellular abundance of BC tumors (Wagner dataset)

  • EXPLANATORY VARIABLES :
  • BUILDING BLOCK 1 (TLS)
  • BUILDING BLOCK 2 (INFLAMMATORY BLOCK)
  • BUILDING BLOCK 3 (CANCER BLOCK)
  • BUILDING BLOCK 3 (Low density component)
  • All variables are numerical and continuous. Building blocks are vectors of size 10(number of cell types)
  • The observed data (Y) is the dataset of Wagner tumors of size, 134 tumors x 10 cell types whose row sums are all equal to 100 (percentages)
Y <- MacroMicroData$MacroTumorsW
BB1 <- MacroMicroData$BBs[1,]
BB2 <- MacroMicroData$BBs[2,]
BB3 <- MacroMicroData$BBs[3,]
BB4 <- MacroMicroData$BBs[4,]
B <-t(MacroMicroData$BBs)

Linear regression on the dataset of Wagner using lm function:

model1 <- lm(B~t(Y)+0)  #lm(t(Y)~BB1 + BB2 + BB3 )


lm_tumors <- function(Y,BB1,BB2,BB3,BB4){
 results_lm <-data.frame()
  for (i in 1:nrow(Y)){
    model2 <- lm(Y[i,]~BB1+BB2 + BB3 + BB4+0)
    #print(summary(model2))
    results_lm[i,1] <- model2[["coefficients"]][1]
    results_lm[i,2] <- model2[["coefficients"]][2]
    results_lm[i,3] <- model2[["coefficients"]][3]
    results_lm[i,4] <- model2[["coefficients"]][4]
    #results_lm$r2 <-model2$coefficients$BBB1
  }
 return(results_lm)
}

results_lm<- lm_tumors(Y,BB1,BB2,BB3,BB4)


fig <-plot_ly(x=results_lm[,1],
        y = results_lm[,2],
        z = results_lm[,3],
        type="scatter3d",
        mode="markers",
        name="Wagner BC tumors",
        showlegend=TRUE,
        marker=list(size=5, color =~results_lm[,3]))
bestcol <- rgb(0, 0, 255, max = 255, alpha = 125, names = "blue20")
fig <- fig%>%add_mesh(x=c(1,0,0),
                        y=c(0,1,0),
                        z=c(0,0,1),
                        name = "BB plane",
                        facecolor= bestcol,
                        opacity=0.3,
                        inherit=TRUE,
                        showlegend=TRUE)

fig <- fig %>%layout(scene = list(xaxis = list(title = "BB1"), yaxis = list(title = "BB2"), zaxis = list(title = "BB3") ),
                       title = "BC tumors from Wagner dataset fitted in the Building Blocks space")
fig
## Warning: 'mesh3d' objects don't have these attributes: 'mode', 'marker'
## Valid attributes include:
## 'type', 'visible', 'legendgroup', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverlabel', 'stream', 'uirevision', 'x', 'y', 'z', 'i', 'j', 'k', 'text', 'hovertext', 'hovertemplate', 'delaunayaxis', 'alphahull', 'intensity', 'intensitymode', 'color', 'vertexcolor', 'facecolor', 'cauto', 'cmin', 'cmax', 'cmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'opacity', 'flatshading', 'contour', 'lightposition', 'lighting', 'hoverinfo', 'showlegend', 'xcalendar', 'ycalendar', 'zcalendar', 'scene', 'idssrc', 'customdatasrc', 'metasrc', 'xsrc', 'ysrc', 'zsrc', 'isrc', 'jsrc', 'ksrc', 'textsrc', 'hovertextsrc', 'hovertemplatesrc', 'intensitysrc', 'vertexcolorsrc', 'facecolorsrc', 'hoverinfosrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
#summary(model2)

In lm(), we consider that the response variable is as vector of observations, whereas here, we have just one observation of dimension 10 (we observe 10 cell types proportions per tumor). So the most convenient way is to compute this as a matrix equation.

Let’s do the same using matrix equations Y= B.Omega. The unknown variable is Omega, a matrix of size 3*134, this matrix contains the proportions of each building block (BB1 -> BB4) for each observation. The result of this equation is \(\Gamma\) = \((\)tB\(.\)B\()^-1\) \(\times\) t(Y)

#Binter <- B[B[]]
OmegaT <- solve(t(B) %*% B) %*% t(B) %*% t(Y)
# Round for easier viewing
Omega <- t(round(OmegaT, 2))
residual <- t(t(Y)- (B%*%OmegaT))
Ymean <-apply(Y, 1,function(x) mean(x))
R2 <- matrix(ncol = 1,nrow=nrow(Y))
for (i in 1:nrow(Y)){
  R2[i] <- cor((B%*%OmegaT[,i]),t(Y)[,i],method = "pearson")^2#1- ((apply(Y-t(residual)))^2/(sum(Y-Ymean)^2))
}
rownames(R2) <- rownames(Y)
colnames(R2) <- "R2"

# Create basic plot

boxplot(R2)
par(bg="aliceblue")
# Change the plot region color
rect(par("usr")[1], par("usr")[3],
     par("usr")[2], par("usr")[4],
     col = "aliceblue") 

# Add a new plot
par(new = TRUE)

# Create your plot
boxplot(R2,
        col = "coral2",
        main = "Boxplot or R^2 values of linear regression on BC tumors (Wagner data)",
        ylab= "R-square ",
        xlab=" ")

#badFit <- R2[R2[,"R2"]<0.6,] # We consider the outliers as the tumors for which the linear regression gives a Rsquare lower than 0.6

We have the same results as in the first method of linear regression. Most of the R square values are close to 1 with the minimum as 0.8, so the linear regression fitted quite well the values to a model with 4 explanatory variables as our building blocks.

Before including the 4th building block (low density cells component with macrophages and leukocytes), most of the R square values for each tumor sample were very close to 1 but had some outliers : this means that the linear model didn’t fit very well for some tumor samples. In other words, there were some samples from Wagner dataset that couldn’t be fully explained by a linear combination of 3 building blocks.

Let’s see the relationship between BB1 and BB2 :

fig3 <- ggplot(data=as_tibble(Omega,rownames=NA),aes(x=BB1,y=BB2)) + geom_point() + 
geom_point()
  ggtitle("Evolution of proportions of building blocks 1 and 2 in BC tumors (Wagner data)")
## $title
## [1] "Evolution of proportions of building blocks 1 and 2 in BC tumors (Wagner data)"
## 
## attr(,"class")
## [1] "labels"
fig3

Omega <- as_tibble(Omega,rownames=NA)
#modelbb12 <- lm(Omega$BB2~Omega$BB1+0)
#ummary(modelbb12)

There is clearly a linear dependency between TLS and inflammatory component. Now, we would like to know the ratio between those two components : apparently, for 1 TLS, there should be around 3.5 inflammatory blocks. A PCA on these two building blocks would help us to determine that result :

Linear regression with interaction

Interaction between inflammatory block and Tertiary Lymphoid block

We compute those coefficients in order to do create a new super building block which represents the linear dependency between the TLSs and the inflammatory block. Then, the correction will be done at the level of the matrix B of cellular composition of the building blocks. Thus, we can make a new linear regression and compute the new matrix Omega which would be Omega2 :

OmegaInflTLS <- Omega[,c("BB1","BB2")]
Omega<-as_tibble(Omega,rownames=NA)
pcaBB1BB2 <-dudi.pca(OmegaInflTLS,center=FALSE,scale=FALSE,scannf=FALSE, nf=2)
fviz_eig(pcaBB1BB2)

fviz_pca_biplot(pcaBB1BB2,col.var = 'indianred2',col.ind = 'steelblue2',invisible = 'ind', repel = TRUE)

plotbb12 <-plot(x=pcaBB1BB2$c1$CS1%*%t(Omega$BB1),y=pcaBB1BB2$c1$CS1%*%t(Omega$BB2),
     xlab="Building block 1",
     ylab= "Building block 2")

plotbb12
## NULL
coeffBB1BB2 <- pcaBB1BB2$co$Comp1 #/(sum(pcaBB1BB2$co$Comp1))
BB1BB2 <- (coeffBB1BB2[1]*B[,1] + coeffBB1BB2[2]*B[,2])/(coeffBB1BB2[1]+coeffBB1BB2[2])
B2<- cbind(B,BB1BB2)
B2 <- B2[,c("BB3","BB4","BB1BB2")]
#colnames(Omega)[5] <- "BB1BB2"

Let’s compute the new matrix Omega2 :

#Binter <- B[B[]]
OmegaT2 <- solve(t(B2) %*% B2) %*% t(B2) %*% t(Y)
# Round for easier viewing
Omega2 <- t(round(OmegaT2, 3))
residual <- t(t(Y)- (B2%*%OmegaT2))
Ymean <-apply(Y, 1,function(x) mean(x))
R22 <- matrix(ncol = 1,nrow=nrow(Y))
for (i in 1:nrow(Y)){
  R22[i] <- cor((B2%*%OmegaT2[,i]),t(Y)[,i],method = "pearson")^2 #1- ((apply(Y-t(residual)))^2/(sum(Y-Ymean)^2))
}
rownames(R22) <- rownames(Y)
colnames(R22) <- "values"
RsqValues <- as_tibble(R22,rownames = NA)%>%mutate(name = "BC patients")
boxplot(R22,main = "Boxplot or R^2 values of linear regression on Wagner dataset (BB1:BB2)",
        ylab = "R^2 ",
        xlab ="linear regression")

vlnplt1 <-ggplot(data = RsqValues,mapping = aes(x = name,y = `values`,fill="#EBB064"))+ #77CA5D
  geom_violin()+
  theme(legend.position = "none")+
   ylim(0,1)+
  xlab("")+ylab("R-square values")
vlnplt1

ggsave("./figs/VlnPlotR2.pdf",vlnplt1, height = 3, width = 4) 

ObsVSPred <- data.frame(Observed =t(Y)[,"T_BB160"], Predicted=(B2%*%OmegaT2[,"T_BB160"]))
#ObsVSPred$Observed <- t(Y)[,"T_BB160"]
#bsVSPred$Predicted <- (B2%*%OmegaT2[,"T_BB160"])

#T_BB160 sample 
ObsVSPred<- as_tibble(ObsVSPred,rownames=NA)%>%
  rownames_to_column(var="cell_type")%>%
  mutate(cell_type = set_cells_names(cell_type))


r2 <-round(RsqValues["T_BB160",]$values,2)
label1 <- as.expression(bquote(italic(r)^2== .(r2)))#expression(atop(paste(italic(r)^2," = ",.(format(r2,digits=2)))))





plot2 <-ggplot(data=ObsVSPred,aes(x=Predicted,y=Observed))+
  geom_point()+
  #geom_text(aes(label=cell_type),hjust=0, vjust=0)+
  ggrepel::geom_text_repel(aes(label=cell_type))+
  geom_abline(slope=1, intercept=0)+
  #stat_cor(method = "pearson", label.x = 3, label.y = 35)+
  annotate("text", x=5, y=33, label= label1)#expression(paste(italic(R)^2, " = ", format(r2, digits = 2)))
plot2
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'

ggsave("./figs/obsVSpred_lm.pdf",plot2, height = 3, width = 4) 
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'
#badFit <- R2[R2[,"R2"]<0.6,] # We consider the outliers as the tumors for which the linear regression gives a Rsquare lower than 0.6

Let’s now plot the results of the regression in 3D :

## Warning: `line.width` does not currently support multiple values.

We can see a triangle encompassed in the 3D-simplex and inside the plan colored in pink. The 3D simplex represents the physical constraints : positive proportions of building blocks (and 10 cell types by extension) and proportions sum up to 1. Only few points are on the border

Comparison between two datasets

The question we would like to tackle is the following : Are the results of linear regression linked to artefacts or due to hazard or could the macro data be really explained byu local architecture ?

Let’s do the same regression but from Keren tumors (macro) to compare if we have the same results. If they are the same, It means that the way we drove the analysis is the same across datasets and that indeed, there is an interaction between building blocks 1 and 2. If not, It means that there is a specificity regarding Wagner, a feature that shapes the Wagner tumors differently from the way Keren tumors were analyzed, because even though the samples passed through CyTOF, in the case of Wagner paper, we had a cell suspension sorted by FACS, whereas for Keren, we had a fixed 2D tumor tissue analyzed by MIBI combined with CyTOF.

Ykeren <-MacroMicroData$KerenTumors
KerenlmResults <- lm_tumors(as.matrix(Ykeren), BB1,BB2,BB3,BB4)

figk1 <-plot_ly(x=KerenlmResults[,1],
        y = KerenlmResults[,2],
        z = KerenlmResults[,3],
        type="scatter3d",
        mode="markers",
        name="Keren BC tumors",
        #color =~KerenlmResults[,4],
        #size=~KerenlmResults[,4],
        showlegend=TRUE,
        marker=list(size=7,color="green"))#,sizes=c(8,16)
bestcol <- rgb(0, 0, 255, max = 255, alpha = 125, names = "blue20")
figk1 <- figk1%>%add_mesh(x=c(1,0,0),
                        y=c(0,1,0),
                        z=c(0,0,1),
                        name = "BB plane",
                        facecolor= bestcol,
                        opacity=0.3,
                        inherit=TRUE,
                        showlegend=FALSE)
figk1 <-figk1%>%add_trace(x=results_lm[,1],
                          y = results_lm[,2],
                          z = results_lm[,3],
                          #size=~results_lm[,4],
                          type="scatter3d",
                          mode="markers",
                          name="Wagner BC tumors",
                          showlegend=TRUE,
                          inherit=FALSE,
                          marker=list(size=7, color ="red"))#,sizes=c(8,16)

figk1 <- figk1 %>%layout(scene = list(xaxis = list(title = "BB1"), yaxis = list(title = "BB2"), zaxis = list(title = "BB3") ),
                       title = "BC tumors from Keren dataset fitted in the Building Blocks space")
figk1
## Warning: 'mesh3d' objects don't have these attributes: 'mode', 'marker'
## Valid attributes include:
## 'type', 'visible', 'legendgroup', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverlabel', 'stream', 'uirevision', 'x', 'y', 'z', 'i', 'j', 'k', 'text', 'hovertext', 'hovertemplate', 'delaunayaxis', 'alphahull', 'intensity', 'intensitymode', 'color', 'vertexcolor', 'facecolor', 'cauto', 'cmin', 'cmax', 'cmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'opacity', 'flatshading', 'contour', 'lightposition', 'lighting', 'hoverinfo', 'showlegend', 'xcalendar', 'ycalendar', 'zcalendar', 'scene', 'idssrc', 'customdatasrc', 'metasrc', 'xsrc', 'ysrc', 'zsrc', 'isrc', 'jsrc', 'ksrc', 'textsrc', 'hovertextsrc', 'hovertemplatesrc', 'intensitysrc', 'vertexcolorsrc', 'facecolorsrc', 'hoverinfosrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

This plot shows the distribution of the breast cancer tumors of 2 datasets after the linear regression of the building blocks. The aim was to assess the presence/absence of differences between the two datasets and try to explain the shape of the cloud of points of the resulting linear regression of Wagner tumors. The tumors from Keren datasets show points scattered inside the 3D-simplex without a clear shape. Nonetheless, there is a space in which we don’t find any point : the bottom part in the 2D graph (BB1;BB2) where the proportion of BB1 is higher than BB2 . It seems that there is a threshold of BB2 for which BB1 might appear. We need to investigate this through other cases of inflammation : mastitis for example. However, there are no known dataset as the ones we could get for the case of mastitis (inflammation localized in the breast tissue due to infection) in human. There is a lot of documentation in bovine mastitis because it is a major issue for milk producers, but not so much for human.

Linear regression with interaction BB1:BB2

Do the tumors samples from Keren dataset obey the same rules (including linear relationship between BB1 and BB2) than BC samples from Wagner dataset ?

With the data from Wagner paper, we could do a linear regression on the building blocks and we could see that TLS and inflammatory building block are linked to each other by a linear relationship : with a ratio of 1:3.8 (for 1 TLS, we have 3.8 inflammatory blocks). So this hypothesis can be verified by looking at how the proportions of BB1 and BB2 vary together across the tumor samples from Keren dataset. THis dataset contains only 43 patients but this is enough to check this ratio between BB1 and BB2 :

colnames(KerenlmResults) <- c("BB1","BB2","BB3","BB4")
ggplot(data=as_tibble(KerenlmResults,rownames=NA),aes(x = BB1,y = BB2))+
  geom_point()

Kerenlmbb12 <- KerenlmResults[,1:2]
KerenBB12PCA <- dudi.pca(Kerenlmbb12,scale = FALSE, center = FALSE,scannf = FALSE,nf = 2)
fviz_eig(KerenBB12PCA)

fviz_pca_biplot(KerenBB12PCA,
                col.var = 'indianred2',
                col.ind = 'steelblue2',
                invisible = 'ind',
                repel = TRUE)

coeffBB1BB2k <- KerenBB12PCA$co$Comp1 #/(sum(pcaBB1BB2$co$Comp1))
BB1BB2k <- (coeffBB1BB2k[1]*B[,1] + coeffBB1BB2k[2]*B[,2])/(coeffBB1BB2k[1]+coeffBB1BB2k[2])

We can still see that linear relationship between BB1 and BB2 in the dataset from Keren et al.,Cell(2018). Do the proportions of BB1/BB2 found in Wagner data the same here ? No, the ratio BB1/BB2 is more 1/8.4 (the coefficients are 0.107.BB1 and 0.386.BB2 for Wagner data and 0.0277.BB1 and 0.2332.BB2).

Let’s compute the new Omega matrix for Keren data :

#Binter <- B[B[]]

#linear_regression_bb <- function(Y,B){
#  OmegaT <- solve(t(B) %*% B) %*% t(B) %*% t(Y)
  # Round for easier viewing
#  Omega <- t(round(OmegaT, 3))
#  R2 <- matrix(ncol = 1,nrow=nrow(Y))
#  for (i in 1:nrow(Y)){
#    R2[i] <- cor((B%*%OmegaT[,i]),t(Y)[,i],method = "pearson")^2#1- ((apply(Y-t(residual)))^2/(sum(Y-Ymean)^2))
#  }
#  rownames(R2) <- rownames(Y)
#  colnames(R2) <- "R2"
#  res <- list("Omega"= Omega,"R2"=R2)
#  return(res)
#}
#rm(OmegaT2k)

B2k <- cbind(B,BB1BB2k)
B2k <- B2k[,c("BB3","BB4","BB1BB2k")]
B2 <- B2[,c("BB3","BB4","BB1BB2")]
OmegaT2k <- solve(t(B2) %*% B2) %*% t(B2) %*% t(Ykeren) #solve(t(B2k) %*% B2k) %*% t(B2k) %*% t(Ykeren)

#Round for easier viewing
Omega2k <- t(round(OmegaT2k, 3))
#residual <- t(t(Y)- (B2%*%OmegaT2))
#Ymean <-apply(Y, 1,function(x) mean(x))
R22k <- matrix(ncol = 1,nrow = nrow(Ykeren))
for (i in 1:nrow(Ykeren)){
  R22k[i] <- cor((B2%*%OmegaT2k[,i]),t(Ykeren)[,i],method = "pearson")^2 #cor((B2k%*%OmegaT2k[,i]),t(Ykeren)[,i],method = "pearson")^2
}
rownames(R22k) <- rownames(Ykeren)
colnames(R22k) <- "R2"

boxplot(R22k,main = "Boxplot or R^2 values of linear regression on Wagner dataset (BB1:BB2)",
        ylab = "R^2 ",
        xlab = "linear regression")

#badFit <- R2[R2[,"R2"]<0.6,] # We consider the outliers as the tumors for which the linear regression gives a Rsquare lower than 0.6
fig1 <- fig1%>%add_trace(x = as_tibble(Omega2k,rownames=NA)$BB1BB2,
                          y = as_tibble(Omega2k,rownames=NA)$BB3,
                          z = as_tibble(Omega2k,rownames=NA)$BB4,
                          #size=~results_lm[,4],
                          type = "scatter3d",
                          mode = "markers",
                          name = "Keren BC tumors",
                          showlegend = TRUE,
                          inherit = FALSE,
                          marker = list(size = 4, color ="green"))
fig1
## Warning: `line.width` does not currently support multiple values.
#BB12k <- (coeffBB1BB2[1]*D[1]+coeffBB1BB2[2]*D[2])/(coeffBB1BB2[1]+coeffBB1BB2[2])
#D[5] <- BB12k
#B2 <- cbind(B,BB1BB2)
#D12 <- D[3:5]

BB12 <- (coeffBB1BB2[1]*D[1]+coeffBB1BB2[2]*D[2])/(coeffBB1BB2[1]+coeffBB1BB2[2])
D[5] <- BB12
#B2<- cbind(B,BB1BB2)
D12 <-1/D[3:5]

Volumek <- as.matrix(Omega2k)  %*% diag(D12)
Volumek <- t(apply(Volumek,1, function(x) x/sum(x)))
colnames(Volumek) <- c("BB3","BB4","BB1BB2")
Volumek <- as_tibble(Volumek,rownames=NA)

fig2 <- fig2%>%add_trace(x = Volumek$BB1BB2,
                          y = Volumek$BB3,
                          z = Volumek$BB4,
                          #size=~results_lm[,4],
                          type = "scatter3d",
                          mode = "markers",
                          name = "Keren BC tumors",
                          showlegend = TRUE,
                          inherit = FALSE,
                          marker = list(size = 4, color ="green"))
fig2

The boxplot shows that the R-square values are scattered from 0.2 to 1.0 but most of the values are around the median which is 0.9 so the fitting is good.

Comparing with micro data of Keren dataset

averageBBSites <-SitesBBKeren%>% 
  group_by(patient_id)%>%
  summarise(meanBB1 = mean(archetype1),
            meanBB2 = mean(archetype2),
            meanBB3 = mean(archetype3),
            meanBB4 = mean(archetype4))

Now let’s plot the average proportion of building blocks for each patient collected in each site :

#plot(x = averageBBSites$meanBB1,y = averageBBSites$meanBB2,
#     xlab="BB1 (proportion)",ylab="BB2 (proportion)")
correlationBB12 <-cor(averageBBSites$meanBB1,averageBBSites$meanBB2,method="pearson")
#par(bg="transparent")
#par(bg="mintcream")
#plot(x=1,
#     xlim=c(0,0.4),
#     ylim=c(0,1),
#     xlab="",
#     ylab= "")
#abline(h=seq(0,1,0.05), v=seq(0,0.4,0.05), lty=3, col="gray")
#mtext(side=1, line=2, "Building block 1", col="gray28", font=2,cex=1.05)
#mtext(side=2, line=3, "Building block 2", col="gray28", font=2, cex=1.05)
#mtext(side=3, line=0.5, "Proportions of building blocks across tumor samples", col="gray28", font=2, cex=1.2)
#points(x=pcaBB1BB2$c1$CS1%*%t(Omega$BB1),y=pcaBB1BB2$c1$CS1%*%t(Omega$BB2),
#     col="cornflowerblue")
#legend("topright", legend=c("Wagner tumors", "Keren tumors","Keren average sites per patient"),
#       col=c("cornflowerblue", "darkorange1","brown3"), pch=1, cex=0.8)
#points(x=Kerenlmbb12$BB1,
#       y=Kerenlmbb12$BB2,
#       col="darkorange1")
#text(x=Kerenlmbb12$BB1, y=Kerenlmbb12$BB2, labels=seq(1, length(Kerenlmbb12$BB2), by=1),cex=0.7,col="darkorange1", pos = 3)
Z = (as.matrix(averageBBSites[,2:5]) %*% diag(D[1:4])) %>% apply(1, sum)




#### GGPLOT VERSION (MORE FANCY)
#Wagner.TLS.Infl <- data.frame(BB1 =as.vector(pcaBB1BB2$c1$CS1%*%t(Omega$BB1)),
#           BB2 =as.vector(pcaBB1BB2$c1$CS1%*%t(Omega$BB2)))


ggplot(data= Omega,aes(x =BB1 ,y =BB2,color="wagner"))+
  geom_point()+
  geom_point(data = Kerenlmbb12,aes(x=BB1,y=BB2,color="keren"))+
  #theme(legend.title=element_text("Dataset"))+
  scale_color_manual(values=c("cornflowerblue", "darkorange1"),
                     labels = c("Keren tumors","Wagner tumors"))+
  labs(color="Dataset") 

ggsave("./figs/BB1vsBB2_WK.pdf", height = 3, width = 4)





#points(x = (averageBBSites$meanBB1*D[1])/Z,
#       y = (averageBBSites$meanBB2*D[2])/Z,
#       col="brown3")
#text(x=(averageBBSites$meanBB1*D[1])/Z, y=(averageBBSites$meanBB2*D[2])/Z, labels=seq(1, length((averageBBSites$meanBB1*D[1])/Z), by=1),cex=0.7,col="brown3", pos = 2)
       #col = transparent("coral2", trans.val = .8))

plot(averageBBSites$meanBB1*D[1]/Z,averageBBSites$meanBB2*D[2]/Z) 

plot(SitesBBKeren$archetype1,SitesBBKeren$archetype2,xlab="",ylab="")
mtext(side=1, line=2, "Building block 1", col="gray28", font=2,cex=1.05)
mtext(side=2, line=3, "Building block 2", col="gray28", font=2, cex=1.05)
mtext(side=3, line=0.5, "Proportions of building blocks across sites sampled from TNBC", col="gray28", font=2, cex=1.2)

length(averageBBSites$meanBB1)
## [1] 40
length(Kerenlmbb12$BB1)
## [1] 43
length(Ykeren)
## [1] 430
#### Compute the contribution to the total volume with the cellular density of each building block

Here, we compute the contribution to the total volume with the cellular density of each building block, knowing that the Building block 4 is the less dense.

## PCA on building lbocks 1 and 2 in order to find a linear combination of them that maximizes the variance explained by this equation :

pca.bb12.keren <- dudi.pca(averageBBSites[,2:3],center = FALSE,scale = FALSE,scannf = FALSE,nf = 2)
fviz_pca_biplot(pca.bb12.keren,col.var = 'indianred2',col.ind = 'steelblue2',invisible = 'ind', repel = TRUE)

BB12keren.avSites <-pca.bb12.keren$co$Comp1
BB12keren.avSites
## [1] 0.04213662 0.10805141
averageBBSites <- as_tibble(averageBBSites,rownames = NA)%>%
  mutate(BB12 = (pca.bb12.keren$co$Comp1[1]*meanBB1 + pca.bb12.keren$co$Comp1[2]*meanBB2)/(pca.bb12.keren$co$Comp1[1]+pca.bb12.keren$co$Comp1[2]))

D[5] <-(pca.bb12.keren$co$Comp1[1]*D[1] + pca.bb12.keren$co$Comp1[2]*D[2])/(pca.bb12.keren$co$Comp1[1]+pca.bb12.keren$co$Comp1[2])
D12kerenSites <- D[3:5]

KerenTumorsBB <- as.matrix(averageBBSites[,c(4,5,6)])%*%diag(D12kerenSites)

KerenTumorsBBavSites  <- as_tibble(t(apply(KerenTumorsBB,1,function(x) x/sum(x))),rownames=NA)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
colnames(KerenTumorsBBavSites) <- c( "BB3","BB4","BB1BB2")
##### BUILDING BLOCK 1 AND 2 ARE NOT LINKED TOGETHER  !!


fig3 <- plot_ly(x = KerenTumorsBBavSites$BB1BB2,#averageBBSites$meanBB2,#(0.1*averageBBSites$meanBB1 + 0.38*averageBBSites$meanBB2)/(0.1+0.38),
        y = KerenTumorsBBavSites$BB3,#averageBBSites$meanBB3,
        z = KerenTumorsBBavSites$BB4,#averageBBSites$meanBB4,
        type = "scatter3d",
        mode = "markers",
        name = "Keren BC tumors",
        #color =~KerenTumorsBB[,1],#averageBBSites$meanBB1,
        #size=~KerenlmResults[,4],
        showlegend = TRUE,
        marker = list(size = 7,color = "red"))#,sizes=c(8,16)
bestcol <- rgb(0, 0, 255, max = 255, alpha = 125, names = "blue20")
fig3 <- fig3%>%add_mesh(x = c(1,0,0),
                        y = c(0,1,0),
                        z = c(0,0,1),
                        name = "BB plane",
                        facecolor = bestcol,
                        opacity = 0.3,
                        inherit = TRUE,
                        showlegend = FALSE)

fig3 <- fig3 %>%layout(scene = list(xaxis = list(title = "BB1:2"), yaxis = list(title = "BB3"), zaxis = list(title = "BB4") ),
                       title = "Average Building blocks props of volume of TNBC tumors form Keren dataset (sites sampling) ")
fig3
## Warning: 'mesh3d' objects don't have these attributes: 'mode', 'marker'
## Valid attributes include:
## 'type', 'visible', 'legendgroup', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverlabel', 'stream', 'uirevision', 'x', 'y', 'z', 'i', 'j', 'k', 'text', 'hovertext', 'hovertemplate', 'delaunayaxis', 'alphahull', 'intensity', 'intensitymode', 'color', 'vertexcolor', 'facecolor', 'cauto', 'cmin', 'cmax', 'cmid', 'colorscale', 'autocolorscale', 'reversescale', 'showscale', 'colorbar', 'coloraxis', 'opacity', 'flatshading', 'contour', 'lightposition', 'lighting', 'hoverinfo', 'showlegend', 'xcalendar', 'ycalendar', 'zcalendar', 'scene', 'idssrc', 'customdatasrc', 'metasrc', 'xsrc', 'ysrc', 'zsrc', 'isrc', 'jsrc', 'ksrc', 'textsrc', 'hovertextsrc', 'hovertemplatesrc', 'intensitysrc', 'vertexcolorsrc', 'facecolorsrc', 'hoverinfosrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
#colSet <- c("dodgerblue" = )

list("pink" = c(212, 99, 223),#D463DF
"red" = c(255,0,0),
"dodgerblue"= col2rgb("dodgerblue"),
"black" = c(0,0,0))
## $pink
## [1] 212  99 223
## 
## $red
## [1] 255   0   0
## 
## $dodgerblue
##       [,1]
## red     30
## green  144
## blue   255
## 
## $black
## [1] 0 0 0

From the micro point of view, the 2 building blocks are not very linearly dependent ==> but there is a ratio of 1/4.8 for BB1 and BB2 after a PCA on those 2 building blocks. But it seems that there is a prior inflammation in the tissue before any TLS appears. THe linear relationship is not very clear because the point of view is very reduced, whereas in the macro point of view, BB1 and 2 are very linked. Moreover, TLS is very visible in micro view whereas it is barely detectable through a cell suspension of a tumor sample. The building blocks seem to contribute homogeneously to the total volume of a tumor which is different from our observations from a macro perspective. Here, we have used the average proportions of building blocks for each tumor sample, this may induce some biases because we pass from 2D sampling to a supposedly 3D perspective ??? Explore why the points are scattered

Distribution of building blocks across patients

BBscores <- as_tibble(Omega,rownames=NA)%>%rownames_to_column(var="sample")%>%pivot_longer(c(BB1,BB2,BB3,BB4),names_to="building_block",values_to="props")
ggplot(data=BBscores,aes( x=sample,fill=building_block,y=props)) + 
  geom_bar(position="stack",stat="identity")+  #(position="fill", stat="identity")+
  labs(title="Bar charts of inferred proportions of building blocks across samples of BC")+
  theme(axis.text.x=element_blank())+ #element_text(angle=45,hjust=0.8,vjust=0.2)
  xlab("")

In this bar charts, we can see the distribution of building blocks per patient. Clearly, the building blocks of cancer and inflammation are the ones that are present in big proportions across the samples. The Building block 4 with the TLS component are often seen in lower proportions. And whenever there is a tumor containing high proportions of building block 4, the TLS component seems to be nonexistent.

library(ggridges)
#theme_set(theme_ridges())
#ggplot(BBscores, aes(y = proportion)) +
#  geom_density_ridges(aes(x = building_block,y = proportion, fill = proportion))# +
#  scale_fill_manual(values = c("#00AFBB", "#E7B800", "#FC4E07","#FC4E08"))
##Creating same dataset as BBscores but here with the building block BB1:BB2 (linear comb of BB1 and BB2):
BBscores.inter <- as_tibble(Omega2,rownames=NA)%>%rownames_to_column(var="sample")%>%pivot_longer(c(BB1BB2,BB3,BB4),names_to="building_block",values_to="props")
BBscores.Vol <- as_tibble(Volume,rownames=NA)%>%rownames_to_column(var="sample")%>%pivot_longer(c(BB1BB2,BB3,BB4),names_to="building_block",values_to="props")


gg1 <-ggplot(BBscores, aes(y =  building_block)) +
geom_density_ridges(
    aes(x = props, fill =building_block), 
    alpha = .8, color = "white", from = 0, to = 1)+
  labs(x= "proportion",
       y="building block",
       title="density of building blocks proportions across BC samples (Wagner data)")+
  scale_fill_manual(values=c("red","yellow","blue","green3"))

gg2 <-ggplot(BBscores.inter, aes(y =  building_block)) +
geom_density_ridges(
    aes(x = props, fill = building_block), 
    alpha = .8, color = "white", from = 0, to = 1)+
  labs(x= "proportion",
       y="building block",
       title="density of building blocks proportions across BC samples (Wagner)")

gg3 <- ggplot(BBscores.Vol, aes(y =  building_block)) +
geom_density_ridges(
    aes(x = props, fill = building_block), 
    alpha = .8, color = "white", from = 0, to = 1)+
  labs(x= "proportion",
       y="building block",
       title="density of building blocks (Vol proportions) across BC samples (Wagner)")
gg1
## Picking joint bandwidth of 0.0537

gg2
## Picking joint bandwidth of 0.0715

gg3
## Picking joint bandwidth of 0.0657

This graph confirms our first suppositions. The building blocks 4 and 1 are present in very low proportions for the majority of the observations. WHereas for building blocks 3 and 2 where we have a gradient : it shows that the heterogeneity of tumors relies mostly on the block of cancer and inflammatory block.

In the case where we included interaction between BB1 and BB2 (linear combination of BB1 and BB2), BB1:BB2 and BB3 contribute to inter-tumoral heterogeneity.

About volume contribution, we have the same general conclusion : building blocks contribute differently to the volume of the tumor. The cancer block is the one that contributes the most to the inter-tumor heterogeneity ==> the density is distributed almost equally across proportion from 0 to 1 with a little peak at around 0.75.

Omega2$AgeSurgery <-MacroMicroData$MetadataWagner$`Age at Surgery`
BBscores.age <-as_tibble(Omega2,rownames=NA)%>%rownames_to_column(var="sample")%>%pivot_longer(c(BB1BB2,BB3,BB4),names_to="building_block",values_to="props")
#ggplot(data=BBscores.age, aes(x=AgeSurgery, y=props,fill=building_block)) +
#  geom_bar(stat="identity",position=position_dodge())
BBscores.age1<-BBscores.age%>%filter(building_block=="BB3")
#ggplot(BBscores.age1,aes(y=props)) +
#geom_density_ridges(
#    aes(x=AgeSurgery,y =props), 
#    alpha = .8, color = "white", from = 0, to = 1)+
#  labs(x= "age",
#       y="proportion of bb",
#       title="density of BB1:BB2 with age across BC samples (Wagner)")
plot(x=BBscores.age1$AgeSurgery,y=BBscores.age1$props)
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion